library(rethinking)
## Loading required package: rstan
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## Loading required package: StanHeaders
## rstan (Version 2.18.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## Loading required package: parallel
## rethinking (Version 1.59)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  2.1.2     ✔ purrr   0.3.2
## ✔ tidyr   0.8.3     ✔ dplyr   0.8.1
## ✔ readr   1.3.1     ✔ stringr 1.4.0
## ✔ tibble  2.1.2     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks rstan::extract()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ purrr::map()     masks rethinking::map()
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(rstan)

0.1 Ch4 Linear Model

In this chapter, we build our first statistical model, linear regression. In past statistics courses, we used the least-squares method to calculate the intercept and slope of the regression line. In QBS, we will again let samples do the estimation for us.

Following the textbook, we use the Howell1 dataset. Our goal is to build a model to describe the height data.

Create the data

# Load the Howell1 census data and keep the adults only (age >= 18).
data(Howell1)
d <- Howell1
is_adult <- d$age >= 18
d2 <- d[is_adult, ]

Before using the LR model, we first look at the distribution of height data. It seems to follow a normal distribution.

# Histogram of adult heights; the distribution looks roughly Gaussian.
ggplot(d2) +
  geom_histogram(aes(x = height))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

0.1.1 Gaussian Model

We want to use a gaussian model (normal distribution) to describe height. A normal distribution has two parameters mu and sigma. We will again use grid approximation and MCMC to estimate the 2 parameters.

0.1.1.1 Grid Approximation

We look at the result first, then explain the code.

Grid Approximation

Grid Approximation

  1. Set the grid for both mu and sigma.
  2. Set priors for mu and sigma:
  • mu ~ N(178, 20) (you can change the sd from 20 to 2 to compare the difference)
  • sigma ~ unif(4, 9)
  3. Calculate the model likelihood.
  4. Calculate the posterior (prior * likelihood).

Code Explanation

# Build the (mu, sigma) grid for grid approximation.
# grid.size is also read later when sizing the plot points (size = 50/grid.size).
grid.size <- 100
mu.list <- seq(from = 140, to = 190, length.out = grid.size)
sigma.list <- seq(from = 4, to = 9, length.out = grid.size)

# One row per (mu, sigma) combination: grid.size^2 rows in total.
post <- expand.grid(mu = mu.list, sigma = sigma.list)
post
# Log-priors for every grid point:
#   mu    ~ Normal(178, 2)   (the text uses sd = 20; sd = 2 shows a tight prior)
#   sigma ~ Uniform(0, 50)   (flat over the whole 4-9 grid, so it adds a constant)
# dnorm()/dunif() are vectorized, so the row-wise apply() (which also coerces the
# data frame to a matrix) is unnecessary; this computes identical values.
post <- post %>%
  mutate(prior_mu = dnorm(mu, mean = 178, sd = 2, log = TRUE),
         prior_sigma = dunif(sigma, min = 0, max = 50, log = TRUE))
post
# Model likelihood

# Log-likelihood of a height sample for one (mu, sigma) grid point:
# the sum of per-observation Gaussian log-densities.
# `heights` defaults to the adult sample, so existing callers are unchanged,
# but the function can now be reused on any numeric vector.
grid_function <- function(mu, sigma, heights = d2$height) {
  sum(dnorm(heights, mean = mu, sd = sigma, log = TRUE))
}

# Combine prior and likelihood on the log scale, then normalise for plotting.
post <- post %>%
  # Per-row log-likelihood at each (mu, sigma). mapply() iterates the two
  # numeric columns directly instead of apply()-ing over the whole data frame.
  mutate(model_likelihood = mapply(grid_function, mu, sigma)) %>%
  mutate(prior_likelihood = prior_mu + prior_sigma,  # log prior (spelling fixed)
         post_likelihood = model_likelihood + prior_mu + prior_sigma) %>%
  # Subtract the column max before exp() to avoid numerical underflow;
  # the results are relative (unnormalised) probabilities.
  mutate(post_prob = exp(post_likelihood - max(post_likelihood)),
         model_prob = exp(model_likelihood - max(model_likelihood)),
         prior_prob = exp(prior_likelihood - max(prior_likelihood)))

post
# Plot four views of the grid: the raw grid, the prior, the model likelihood,
# and their product (the posterior), each as a coloured point cloud.
pt_size <- 50 / grid.size

plt1 <- ggplot(post) +
  geom_point(aes(mu, sigma), size = pt_size, shape = 16) +
  ggtitle("raw grid")

plt2 <- ggplot(post) +
  geom_point(aes(mu, sigma, color = prior_prob), size = pt_size, shape = 16) +
  scale_colour_gradientn(colours = topo.colors(10)) +
  ggtitle("Prior")

plt3 <- ggplot(post) +
  geom_point(aes(mu, sigma, color = model_prob), size = pt_size, shape = 16) +
  scale_colour_gradientn(colours = topo.colors(10)) +
  ggtitle("Model likelihood")

plt4 <- ggplot(post) +
  geom_point(aes(mu, sigma, color = post_prob), size = pt_size, shape = 16) +
  scale_colour_gradientn(colours = topo.colors(10)) +
  ggtitle("Grid Posterior")

grid.arrange(plt1, plt2, plt3, plt4, ncol = 1)

0.1.1.2 MCMC

# Stan version of the Gaussian model.
# FIX: sigma is declared with <lower=0, upper=50> so the parameter space matches
# its uniform(0, 50) prior. Without the bounds Stan can initialise sigma outside
# the prior's support, producing rejected proposals / initialisation failures.
normal.model <- "
data {
    int N;
    vector[N] x;
}
parameters {
    real mu;
    real<lower=0, upper=50> sigma;
}
model {
    // prior
    mu ~ normal(178, 20);
    sigma ~ uniform(0, 50);

    // model
    x ~ normal(mu, sigma);
}
"
# Fit with 2 chains, then keep only the mu and sigma draws.
normal.data <- list(N = nrow(d2), x = d2$height)
normal.fit <- stan(model_code = normal.model, data = normal.data, iter = 5000, chains = 2)
normal.post <- normal.fit %>%
  as.data.frame() %>%
  select(mu, sigma)

# 2-D histogram of the MCMC posterior, cropped to the same axes as the grid
# posterior so the two plots line up when stacked.
plt5 <- ggplot(normal.post, aes(mu, sigma)) +
  geom_bin2d(bins = 100) +
  scale_fill_distiller(palette = 1, direction = -1) +
  xlim(140, 190) +
  ylim(4, 9) +
  ggtitle("MCMC Posterior")

grid.arrange(plt4, plt5, ncol = 1)
## Warning: Removed 1 rows containing non-finite values (stat_bin2d).

0.1.2 Simple Linear Regression

Starting from now, we will no longer use grid approximation since our problem becomes complicated.

Let’s add weight as our predictor to build a simple linear regression model.

  1. Data
  • x = weight
  • y = height
  2. Prior
  • alpha ~ normal(178, 20)
  • beta ~ normal(0, 10)
  • sigma ~ unif(0, 50)
  3. Model
  • mu = alpha + beta * (weight - mean(weight))
  • height ~ normal(mu, sigma)

Following the process in the textbook, we de-mean the weight data, so the actual x in the model is weight - mean(weight). Then we should check our prior.
d2$weight.c = d2$weight - mean(d2$weight)

0.1.2.1 Uneducated Prior

We started with an uneducated prior: * alpha ~ normal(178, 20) * beta ~ normal(0, 10)

We can plot the regression lines suggested by the prior by executing this code.

# Draw 100 (alpha, beta) pairs from the "uneducated" prior:
# alpha ~ Normal(178, 20), beta ~ Normal(0, 10).
prior.data <- tibble(
  alpha = rnorm(100, mean = 178, sd = 20),
  beta = rnorm(100, mean = 0, sd = 10))

# Overlay one regression line per prior draw. Each geom_line() call receives
# the current copy of d2, so every layer keeps that draw's predicted heights.
p <- ggplot(data = NULL)

for (draw in seq_len(nrow(prior.data))) {
  d2$pred_height <-
    prior.data[[draw, "alpha"]] +
    prior.data[[draw, "beta"]] * (d2$weight - mean(d2$weight))

  p <- p + geom_line(data = d2, aes(weight, pred_height), alpha = .3)
}

p

0.1.2.2 Better Prior

We can add some of our common sense to our prior. For example, we know that weight can’t be negatively related to height, so we use a lognormal prior to prevent negative coefficients. The resulting prior and suggested lines now become:

  • alpha ~ normal(178, 20)
  • beta ~ log-normal(0, 1) (same as log(beta) ~ normal(0, 1))
  • sigma ~ unif(0, 50)
# Same simulation with a positive-slope prior: beta ~ LogNormal(0, 1).
# FIX: rlnorm()'s arguments are `meanlog` and `sdlog`; the original
# `mean =` / `sd =` only worked through partial argument matching.
prior.data2 <- tibble(
  alpha = rnorm(100, mean = 178, sd = 20),
  beta = rlnorm(100, meanlog = 0, sdlog = 1))  # the only thing changed

# One line per prior draw, as before.
p2 <- ggplot(data = NULL)

for (i in seq_len(nrow(prior.data2))) {
  d2$pred_height <-
    prior.data2[[i, "alpha"]] +
    prior.data2[[i, "beta"]] * (d2$weight - mean(d2$weight))

  p2 <- p2 +
    geom_line(data = d2, aes(weight, pred_height), alpha = .3)
}

p2

There is no correct prior. Priors are information we know before seeing data. Here we know that weight should not be negatively related to height. Thus a log-normal prior is provided.

When we don’t have such information, we still usually know enough about the plausible range of values.

We can try different priors to choose a reasonable one. More importantly, when we have lots of data, model likelihood will dominate the posterior.

0.1.2.3 Model fitting

# Stan linear regression of height on centred weight.
# Fixes:
#  * sigma declared <lower=0, upper=50> to match its uniform(0, 50) prior
#    (an unconstrained sigma lets Stan initialise outside the support);
#  * generated quantities assigned mu twice — the declaration-with-initialiser
#    is the single definition now.
LR.model <- "
data {
    int N;
    vector[N] x;
    vector[N] y;
}

parameters {
    real alpha;
    real<lower=0> beta;
    real<lower=0, upper=50> sigma;
}

model {
    vector[N] mu = alpha + beta * x;
  // prior
    alpha ~ normal(178, 20);
    beta ~ lognormal(0, 1);
  sigma ~ uniform(0,50);

    // model
  
    y ~ normal(mu, sigma);
}


generated quantities {
    real pred_y[N];
  vector[N] mu = alpha + beta * x;

    pred_y = normal_rng(mu, sigma);
}
"

# Package the centred-weight data, sample the posterior (2 parallel chains),
# and keep every draw as a data frame for the CI/PI plots below.
LR.data <- list(
  N = nrow(d2),
  x = d2$weight.c,
  y = d2$height
)

LR.fit <- stan(
  model_code = LR.model,
  data = LR.data,
  chains = 2,
  iter = 1000,
  cores = 2
)
LR.post <- as.data.frame(LR.fit)

0.1.3 Interpreting the Posterior

Once you have more than a couple of parameters in a model, it is very hard to figure out from numbers alone how all of them act to influence prediction.

The author emphasizes plotting posterior distributions and posterior predictions, instead of attempting to understand a table.

0.1.3.1 Confidence Interval

The interval for mean prediction on each weight.

# Posterior draws of mu[i]: the mean height at each observed weight.
pred_mu <- LR.post %>%
  select(contains("mu"))

# Compute the 89% HPDI once per column (the original called HPDI twice).
mu_hpdi <- apply(pred_mu, 2, HPDI)

CI <- data.frame(
  mean = apply(pred_mu, 2, mean),
  L_HPDI = mu_hpdi[1, ],
  H_HPDI = mu_hpdi[2, ],
  weight = d2$weight)

# Mean regression line with its 89% interval over the raw data.
# FIX: this is the confidence-interval plot; the title said "Prediction Interval".
CI %>%
  ggplot() +
  geom_point(data = d2, aes(weight, height), color = "blue", alpha = .3) +
  geom_line(aes(weight, mean)) +
  geom_ribbon(aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .3) +
  ggtitle("89% Confidence Interval")

0.1.3.2 Prediction Interval

The interval for one prediction on each weight.

# Posterior predictive draws pred_y[i]: individual heights (includes sigma).
pred_y <- LR.post %>%
  select(contains("pred_y"))

# Compute the 89% HPDI once per column (the original called HPDI twice).
y_hpdi <- apply(pred_y, 2, HPDI)

PI <- data.frame(
  mean = apply(pred_y, 2, mean),
  L_HPDI = y_hpdi[1, ],
  H_HPDI = y_hpdi[2, ],
  weight = d2$weight)

# Predicted-height band around the mean line, over the raw data.
PI %>%
  ggplot() +
  geom_point(data = d2, aes(weight, height), color = "blue", alpha = .3) +
  geom_line(aes(weight, mean)) +
  geom_ribbon(aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .3) +
  ggtitle("89% Prediction Interval")

0.1.3.3 CI vs PI

  • CI: Consider the uncertainty of alpha and beta.
  • The posterior distribution is a ranking of the relative plausibilities of every possible combination of parameter values.

  • PI: Consider the uncertainty of alpha, beta and sigma.
  • The distribution of height, is a distribution that includes sampling variation from some process that generates Gaussian random variables.

# Overlay both intervals: the narrow CI band (uncertainty in the mean line)
# sits inside the wider PI band (uncertainty in individual heights).
ci_pi_plot <- ggplot() +
  geom_point(data = d2, aes(weight, height), color = "blue", alpha = .3) +
  geom_line(data = CI, aes(weight, mean)) +
  geom_ribbon(data = CI, aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .5) +
  geom_ribbon(data = PI, aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .2)
ci_pi_plot

0.1.4 Polynomial Regression

Let’s try standardizing weight and plugging in the full dataset to see if we can get a better result. After standardizing x, the relationship appears to be a quadratic curve, so we can try to fit a polynomial linear regression (PLR) model.

# Full dataset (all ages) against standardised weight: the relation is curved.
ggplot(d, aes(scale(weight), height)) +
  geom_point()

# Stan quadratic (polynomial) regression: mu = alpha + beta1*x + beta2*x^2.
# FIX: sigma declared <lower=0, upper=50> to match its uniform(0, 50) prior;
# unconstrained sigma lets Stan initialise outside the prior's support.
PLR.model <- "
data {
    int N;
    vector[N] x;
    vector[N] y;
}
parameters {
    real alpha;
    real<lower=0> beta1;
    real beta2; // add another coefficient
    real<lower=0, upper=50> sigma;
}
model {
  vector[N] mu;
    // prior
    alpha ~ normal(178, 20);
    beta1 ~ lognormal(0, 1);
    beta2 ~ normal(0, 10); // add another prior
    sigma ~ uniform(0, 50);

    // model
    mu = alpha + beta1 * x + beta2 * (x .* x); // .* : elementwise product
    y ~ normal(mu, sigma);
}

generated quantities {
    real pred_y[N];
  vector[N] mu;

    mu = alpha + beta1 * x + beta2 * (x .* x); // .* : elementwise product
    pred_y = normal_rng(mu, sigma);
}
"
# Standardise weight (scale() returns a 1-column matrix; take column 1)
# and fit the quadratic model on the full dataset.
PLR.data <- list(
  N = nrow(d),
  x = scale(d$weight)[, 1],
  y = d$height
)

PLR.fit <- stan(
  model_code = PLR.model,
  data = PLR.data,
  chains = 2,
  cores = 2,
  iter = 1000
)

Check the result

# Posterior summary for the four top-level parameters. n_eff (effective sample
# size) and Rhat near 1 in the output below indicate the two chains mixed well.
print(PLR.fit, pars = c("alpha", "beta1", "beta2", "sigma"), probs=c(.25, .5, .75))
## Inference for Stan model: 1d9d19738f8d6840f4cd6604c250fa97.
## 2 chains, each with iter=1000; warmup=500; thin=1; 
## post-warmup draws per chain=500, total post-warmup draws=1000.
## 
##         mean se_mean   sd    25%    50%    75% n_eff Rhat
## alpha 146.69    0.01 0.38 146.42 146.68 146.96   679 1.00
## beta1  21.39    0.01 0.29  21.20  21.40  21.56   424 1.01
## beta2  -8.43    0.01 0.29  -8.62  -8.42  -8.22   593 1.00
## sigma   5.79    0.01 0.18   5.67   5.78   5.91   529 1.00
## 
## Samples were drawn using NUTS(diag_e) at Wed Aug  7 14:58:31 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at 
## convergence, Rhat=1).
# Keep every draw (including mu[i] and pred_y[i]) for the CI/PI plots below.
PLR.post = as.data.frame(PLR.fit)

0.1.4.1 CI & PI

The only thing we need to do is change LR.post to PLR.post.

Data for CI:

# CI
pred_mu = PLR.post %>% 
  select(contains("mu"))

CI = data.frame(
  mean = pred_mu %>% apply(., 2, mean),
  L_HPDI = pred_mu %>% apply(.,2,HPDI) %>% .[1,],
  H_HPDI = pred_mu %>% apply(.,2,HPDI) %>% .[2,],
  weight = d$weight)

Data for PI:

# PI data: mean and 89% HPDI of the posterior predictive draws at each weight.
pred_y <- PLR.post %>%
  select(contains("pred_y"))

# Compute the HPDI once per column (the original called HPDI twice).
y_hpdi <- apply(pred_y, 2, HPDI)

PI <- data.frame(
  mean = apply(pred_y, 2, mean),
  L_HPDI = y_hpdi[1, ],
  H_HPDI = y_hpdi[2, ],
  weight = d$weight)

plotting the result

# Quadratic fit over the full data: CI band (dark) inside the PI band (light).
final_plot <- ggplot() +
  geom_point(data = d, aes(weight, height), color = "blue", alpha = .3) +
  geom_line(data = CI, aes(weight, mean)) +
  geom_ribbon(data = CI, aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .6) +
  geom_ribbon(data = PI, aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .2)
final_plot